PE with flexible adjustments

## Clean up a database connection left over from a previous run of this
## script in the same session. Guard with inherits() so we only disconnect
## an actual DBI connection (exists("con") alone would match any stale
## object named con), and qualify with DBI:: because library(DBI) is not
## loaded until later in the script.
## NOTE(review): remove(list=ls()) wipes the entire global environment —
## kept for parity with the original "fresh start" intent.
if (exists("con") && inherits(con, "DBIConnection")) {
  DBI::dbDisconnect(con)
  remove(list=ls())
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(DBI)
library(ggsci)
library(DT)
library(ggrepel)
library(cowplot)
library(reactable)
library(gt)
library(broom)
## Open the SQLite database of precomputed PE (phenotype-exposure) summary
## statistics. tbl() creates lazy dbplyr references; rows are only pulled
## into R when collect() is called.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname='../db/pe_summary_stats_02_2024-v2.sqlite')
## Variable metadata (names/descriptions) and the across-survey meta-analysis.
varnames <- tbl(con, "variable_names_epcf")
adjusted_meta <- tbl(con, "adjusted_meta")
#mvr2 <- tbl(con, 'mvr2') |> mutate(mv = mve_rsq-base_rsq)
## Multivariate R^2 comes from a local RDS snapshot rather than the db;
## mv = R^2 gain of the multivariate exposure model over the base model.
mvr2 <- read_rds('../pe_summary_060623/mvr2.rds') |> mutate(mv = mve_rsq-base_rsq)
## Per-survey coefficients (pe), per-model R^2 values (rsq), and per-model
## fit statistics (glanced; broom::glance-style columns).
pe <- tbl(con, "pe")
pe_r2 <- tbl(con, "rsq")
glanced <- tbl(con, "glanced")
## Attach R^2 columns to the fit statistics, keyed by association, model
## number, aggregate-base-model flag, and series.
glanced <- glanced |> left_join(pe_r2 |> select(exposure, phenotype, series, model_number,aggregate_base_model, rsq, adj.r2), by=c("exposure", "phenotype", "model_number", "aggregate_base_model", "series"))
variable_domain <- tbl(con, "variable_domain")
## Exposure coefficient rows: term names starting with 'expo' (SQL LIKE).
expos <- pe |> filter(term %like% 'expo%') |> rename(evarname=exposure, pvarname=phenotype)
## One row per association, with model-number-suffixed stat columns
## (estimate_1, estimate_2, p.value_2, ...).
expos_wide <- expos |> pivot_wider(names_from = "model_number", values_from = c("estimate", "std.error", "statistic", "p.value")) 
## Widen fit stats by model_number x aggregate_base_model; e.g. rsq_2_0 is
## presumably model 2 (adjusted) and rsq_2_1 its base counterpart — the
## suffix semantics come from the db schema, not visible here (TODO confirm).
glanced_wide <- glanced |> select(-c(adj.r2, df.residual, null.deviance, df.null, deviance)) |> pivot_wider(names_from=c("model_number", "aggregate_base_model"), values_from = c("rsq", "nobs", "AIC", "BIC")) |>  rename(evarname=exposure, pvarname=phenotype)

## R^2 attributable to the exposure: adjusted minus base (rsq_2_0 - rsq_2_1)
## and adjusted minus model 1 (rsq_2_0 - rsq_1_0).
glanced_wide <- glanced_wide |> mutate(rsq_adjusted_base_diff=rsq_2_0-rsq_2_1, rsq_adjusted_diff = rsq_2_0-rsq_1_0) 

## Join fit statistics and variable metadata onto the coefficient table.
## varnames is joined twice (exposure side, then phenotype side), producing
## .x/.y suffixed description columns renamed below.
expos_wide <- expos_wide |> left_join(glanced_wide |> select(-c(series, log_p, log_e, scaled_p, scaled_e)), by=c("evarname", "pvarname", "exposure_table_name", "phenotype_table_name"))
expos_wide <- expos_wide |> left_join(varnames, by=c("evarname"="Variable.Name", "exposure_table_name"="Data.File.Name"))
expos_wide <- expos_wide |> left_join(varnames |> select(Variable.Name, Data.File.Name, Variable.Description, Data.File.Description), 
                                      by=c("pvarname"="Variable.Name", "phenotype_table_name"="Data.File.Name"))


## Materialize in R and give the .x/.y metadata columns readable names.
expos_wide <- expos_wide |> collect() |> select(-Use.Constraints) |> rename(e_data_file_desc=Data.File.Description.x, p_data_file_desc=Data.File.Description.y,e_variable_description=Variable.Description.x, p_variable_description=Variable.Description.y)

## Per-association summary across surveys: mean exposure-attributable R^2
## (adjusted - base and adjusted - unadjusted) and pooled sample size,
## restricted to the exposure term rows.
expos_wide_summary <- expos_wide |> filter(term == 'expo' | term == 'expo1' | term == 'expo2') |> group_by(evarname, pvarname) |> summarize(mean_adjusted_base_r2_diff = mean(rsq_adjusted_base_diff), mean_unadjusted_r2_diff=mean(rsq_adjusted_diff), total_n = sum(nobs_2_0)) |> ungroup()
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.
## Fully adjusted (model 2) meta-analytic results, annotated with the R^2
## summaries computed above.
adjusted_meta_full <- adjusted_meta |> filter(model_number == 2) |> collect() |> left_join(expos_wide_summary, by=c("evarname", "pvarname")) ## fully adjusted model

## One row per variable with description/category/subcategory, built
## separately for phenotypes (epcf == 'p') and exposures (epcf == 'e');
## first() resolves duplicate rows per variable name.
p_variable_domain <- variable_domain |> filter(epcf == 'p') |> collect() |> group_by(Variable.Name) |> summarise(pvardesc=first(Variable.Description),pcategory=first(category),psubcategory=first(subcategory))
e_variable_domain <- variable_domain |> filter(epcf == 'e') |> collect() |> group_by(Variable.Name) |> summarise(evardesc=first(Variable.Description),ecategory=first(category),esubcategory=first(subcategory))

## Attach domain annotations to the meta-analysis and per-survey tables.
adjusted_meta_full <- adjusted_meta_full |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
adjusted_meta_full <- adjusted_meta_full |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))

expos_wide <- expos_wide |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
expos_wide <- expos_wide |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))

mvr2 <- mvr2 |> collect() |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))

Number of unique exposures and phenotypes

## Count distinct exposures (E) and phenotypes (P), and the size of the
## full E x P grid of candidate associations.
num_e <- dplyr::n_distinct(adjusted_meta_full$evarname)
num_p <- dplyr::n_distinct(adjusted_meta_full$pvarname)
num_e
## [1] 854
num_p
## [1] 319
num_e * num_p
## [1] 272426

Number of exposures, phenotypes, and associations observed in a given number of surveys

#adjusted_meta <- adjusted_meta |> unnest(glanced) |> unnest(tidied)
## For each distinct survey count (nobs), tally distinct exposures,
## distinct phenotypes, and total P-E associations. A single grouped pass
## replaces the original map over sort(unique(nobs)), which re-filtered
## the whole table three times for every nobs value.
n_obss <- sort(unique(adjusted_meta_full$nobs))

num_tests <- adjusted_meta_full |>
  filter(!is.na(nobs)) |>  # sort(unique()) above drops NA; match that
  group_by(nobs) |>
  summarize(n_expos = n_distinct(evarname),
            n_phenos = n_distinct(pvarname),
            n_pxe = n(),
            .groups = "drop") |>
  arrange(nobs) |>         # align row order with the sorted n_obss vector
  select(n_expos, n_phenos, n_pxe)

num_tests  |> mutate(n_surveys=n_obss) |> gt()
n_expos n_phenos n_pxe n_surveys
854 319 64488 1
649 278 32439 2
528 241 17743 3
414 143 11084 4
355 117 4569 5
324 79 3736 6
205 64 2249 7
177 60 1602 8
28 60 593 9
6 29 60 10

Keep associations observed in at least 2 surveys

## Keep only P-E associations estimated in at least two surveys, and count
## the distinct exposures and phenotypes that remain.
adjusted_meta_2 <- adjusted_meta_full |> filter(nobs >= 2)
n_evars <- dplyr::n_distinct(adjusted_meta_2$evarname)
n_pvars <- dplyr::n_distinct(adjusted_meta_2$pvarname)

Sample sizes within and across all surveys

## Per P-E pair: pooled sample size across surveys, number of contributing
## surveys, and median per-survey n (exposure term rows only).
sample_size_per_pair <- expos_wide |>
  filter(term %in% c('expo', 'expo1')) |>
  group_by(evarname, pvarname) |>
  summarize(total_n = sum(nobs_2_0),
            n_surveys = n(),
            median_n = median(nobs_2_0))
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.

Summary of the summary stats

## Multiple-testing correction (Benjamini-Yekutieli and Bonferroni) plus a
## three-tier significance label per association. A single mutate() replaces
## the original two; later columns may reference earlier ones.
adjusted_meta_2 <- adjusted_meta_2 |>
  ungroup() |>
  mutate(
    pval_BY = p.adjust(p.value, method = "BY"),
    pvalue_bonferroni = p.adjust(p.value, method = "bonferroni"),
    sig_levels = case_when(
      pvalue_bonferroni < 0.05 ~ 'Bonf.<0.05',
      pval_BY < 0.05 ~ 'BY<0.05',
      TRUE ~ '> BY & Bonf.'
    )
  )

## Nominal Bonferroni threshold for the number of tests performed.
bonf_thresh <- 0.05 / nrow(adjusted_meta_2)
## Distribution of p-values and effect sizes across all kept associations.
quantile(adjusted_meta_2$p.value, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = T)
##           1%           5%          10%          20%          30%          40% 
## 1.416929e-26 1.084879e-08 4.655408e-05 6.992836e-03 4.441579e-02 1.211390e-01 
##          50%          60%          70%          80%          90%          95% 
## 2.337699e-01 3.689430e-01 5.190798e-01 6.756578e-01 8.373072e-01 9.183359e-01 
##          99% 
## 9.838565e-01
quantile(adjusted_meta_2$estimate, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = T)
##           1%           5%          10%          20%          30%          40% 
## -0.246857718 -0.093162824 -0.061155239 -0.034324215 -0.020507593 -0.010977694 
##          50%          60%          70%          80%          90%          95% 
## -0.002793128  0.005238097  0.015238188  0.029132279  0.057786585  0.100187654 
##          99% 
##  0.301700609
## Fraction of associations surviving Bonferroni correction.
sum(adjusted_meta_2$pvalue_bonferroni < 0.05)/nrow(adjusted_meta_2) 
## [1] 0.06726966
## Counts per significance tier, the weakest BY-significant hits, and the
## R^2 distribution within each tier.
adjusted_meta_2 |> group_by(sig_levels) |> count()
adjusted_meta_2 |> filter(sig_levels == 'BY<0.05') |> arrange(-p.value) |> head()
adjusted_meta_2 |> group_by(sig_levels) |> summarize(r2_25=quantile(mean_adjusted_base_r2_diff, probs=.25, na.rm = T),
                                                     r2_50=quantile(mean_adjusted_base_r2_diff, probs=.5, na.rm = T),
                                                     r2_75=quantile(mean_adjusted_base_r2_diff, probs=.75, na.rm = T),
                                                      r2_100=max(mean_adjusted_base_r2_diff, na.rm = T))
adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> arrange(-mean_adjusted_base_r2_diff)
## qqplot
## Observed vs uniform-expected p-values; qq_p holds the QQ plot, and
## pval_qq is reused by the histogram panels below.
pval_qq <- data.frame(observed = sort(adjusted_meta_2$p.value), expected = (1:nrow(adjusted_meta_2))/nrow(adjusted_meta_2))
qq_p <- ggplot(pval_qq, aes(-log10(expected), -log10(observed)))
qq_p <- qq_p + geom_point()
##

## Histogram of the raw association p-values; blue line marks the density
## expected under a uniform (null) distribution.
p_hist <- ggplot(pval_qq, aes(observed)) +
  geom_histogram(bins=100) +
  theme_bw() +
  geom_hline(yintercept = 1, color='blue') +
  xlab("P-E association pvalue")
p_hist

## Same view zoomed into the extreme tail (p < 1e-3) on the -log10 scale.
p_hist <- ggplot(pval_qq |> filter(observed < 1e-3), aes(-log10(observed))) +
  geom_histogram(bins=200) +
  theme_bw() +
  scale_x_continuous(limits=c(0, 100)) +
  geom_hline(yintercept = 1, color='blue') +
  xlab("P-E association pvalue")
p_hist
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

## P-value densities stratified by exposure category (A) and by phenotype
## category (B), stacked with cowplot.
p1 <- ggplot(adjusted_meta_2, aes(p.value)) +
  geom_density() +
  theme_bw() +
  facet_grid(~ecategory) +
  xlab("P-E association pvalue")

p2 <- ggplot(adjusted_meta_2, aes(p.value)) +
  geom_density() +
  theme_bw() +
  facet_grid(~pcategory) +
  xlab("P-E association pvalue")

plot_grid(p1, p2, nrow=2, labels=c("A", "B"))

Zoom in on the distribution

## Overlayed histogram (on the density scale) and kernel density of the
## p-values, faceted by exposure category (A) and phenotype category (B).
p_plot <- adjusted_meta_2 |> select(ecategory, pcategory, p.value) 

## after_stat(density) replaces the deprecated ..density.. dot-dot notation
## (after_stat() is available since ggplot2 3.3.0; this session runs 3.3.6).
p <- ggplot(p_plot,aes(p.value))
p <- p + geom_histogram(aes(y=after_stat(density))) + geom_density()  + theme_bw() + facet_grid(~ecategory)
p1 <- p + xlab("P-E association pvalue")


p <- ggplot(p_plot, aes(p.value))
p <- p + geom_histogram(aes(y=after_stat(density))) + geom_density() + theme_bw() + facet_grid(~pcategory)
p2 <- p + xlab("P-E association pvalue")

plot_grid(p1, p2, nrow=2, labels=c("A", "B"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Association Size vs. -log10(pvalue)

## Per-exposure summary. Rows are sorted by Bonferroni p-value first, so
## first() picks values from each exposure's single most significant
## association. NOTE(review): the max_* columns therefore mean "value at
## the minimum p-value", not maxima — the names are misleading.
e_summary <- adjusted_meta_2 |> group_by(evarname) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  mean_estimate=mean(abs(estimate), na.rm=T), 
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_pvarname=first(pvarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)


## Mirror-image summary per phenotype.
p_summary <- adjusted_meta_2 |> group_by(pvarname) |> arrange(pvalue_bonferroni) |> 
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T), mean_estimate=mean(abs(estimate), na.rm=T),
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)

## deeper summary by group

## Group-level summaries over combined category_subcategory keys
## (unite() joins the two columns with "_"; NA subcategories become the
## literal string "NA"). As above, arrange + first() make the max_* columns
## describe the group's single lowest-p association.
p_group_summary <- adjusted_meta_2 |> unite(p_scategory, c(pcategory, psubcategory)) |> group_by(p_scategory) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  mean_estimate=mean(abs(estimate), na.rm=T), 
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)


## Same per exposure group, with both signed and absolute mean estimates.
e_group_summary <- adjusted_meta_2 |> unite(e_scategory, c(ecategory, esubcategory)) |> group_by(e_scategory) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  
            mean_abs_estimate=mean(abs(estimate), na.rm=T),
            mean_estimate=mean((estimate), na.rm=T),
            median_pvalue=median(p.value, na.rm=T), 
            n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  
            median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), 
            max_pvarname=first(pvarname), 
            max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)

Tables 1 and 2

## 

## Strip a trailing parenthetical units annotation from a variable
## description, e.g. "Cholesterol (mg/dL)" -> "Cholesterol". Also trims the
## trailing whitespace the original left behind ("Cholesterol ") so plot
## titles and table cells are clean. Vectorized; NA inputs stay NA.
remove_units_from_string <- function(vardesc) {
  trimws(gsub("\\(.*\\)$", "", vardesc), which = "right")
}
## Rebuild e_group_summary for Table 1 (this overwrites the version built
## above), restricted to exposure terms. The double arrange key means
## first() picks the lowest Bonferroni p, tie-broken by R^2, as the
## example association shown for each exposure group.
e_group_summary <- adjusted_meta_2 |> filter(term_name %in% c('expo', 'expo1', 'expo2', 'expo3')) |> unite(e_scategory, c(ecategory, esubcategory)) |> group_by(e_scategory) |> arrange(pvalue_bonferroni,mean_adjusted_base_r2_diff) |>  
  summarize(
            #median_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  
            #median_abs_estimate=median(abs(estimate), na.rm=T),
            n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  
            #median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), 
            ## Descriptions get their units stripped for display.
            max_termname=remove_units_from_string(first(term_name)),
            max_pvarname=remove_units_from_string(first(pvardesc)),
            max_evarname=remove_units_from_string(first(evardesc)),
            max_estimate=first(estimate), max_p.value=first(p.value), max_i.squared=first(i.squared)) |> mutate(n_sig_pct=n_sig/n_tests)

## Among Bonferroni-significant hits only: median R^2 and effect-size
## quartiles per exposure group, split by the sign of the estimate, then
## widened to one row per group with _neg/_pos column suffixes.
e_bonf_group_summary <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> 
  unite(e_scategory, c(ecategory, esubcategory)) |>
  mutate(sgn=ifelse(sign(estimate) <0, 'neg', 'pos')) |> group_by(e_scategory, sgn) |> arrange(pvalue_bonferroni) |>  
  summarize(median_bonf_r2=median(mean_adjusted_base_r2_diff, na.rm=T),  
            q25_bonf_estimate=quantile(estimate, probs=.25, na.rm=T),
            median_bonf_estimate=median((estimate), na.rm=T),
            q75_bonf_estimate=quantile(estimate, probs=.75, na.rm=T)
  )
## `summarise()` has grouped output by 'e_scategory'. You can override using the
## `.groups` argument.
e_bonf_group_summary <- e_bonf_group_summary |> pivot_wider(names_from=sgn, values_from=(c(median_bonf_r2, q25_bonf_estimate, median_bonf_estimate, q75_bonf_estimate)))

## Merge the overall and Bonferroni-only summaries, split the combined
## category key back into category/subcategory, keep groups with more than
## one significant hit, and order the columns for display.
e_group_summary <- e_group_summary |>
  left_join(e_bonf_group_summary, by='e_scategory') |>
  separate(col=e_scategory, into=c("escategory", "esubcategory"), sep="_") |>
  filter(n_sig > 1) |>
  select(escategory, esubcategory, n_tests, n_sig_pct,
         median_bonf_r2_neg, median_bonf_r2_pos,
         q25_bonf_estimate_neg, median_bonf_estimate_neg, q75_bonf_estimate_neg,
         q25_bonf_estimate_pos, median_bonf_estimate_pos, q75_bonf_estimate_pos,
         max_pvarname, max_evarname, max_estimate, max_r2, max_p.value, max_i.squared)

## Render as a gt table: numeric formatting, column spanners grouping the
## negative/positive effect-size columns and the example-association
## columns; then save to HTML and print. Note this rebinds e_group_summary
## from a data frame to a gt object, as in the original.
e_group_summary <- e_group_summary |>
  gt() |>
  fmt_number(columns = c(starts_with("q"), starts_with("med")), decimals = 3) |>
  fmt_number(columns = c(max_r2, max_estimate), decimals = 3) |>
  fmt_number(columns = c(n_sig_pct), decimals = 2) |>
  fmt_number(columns = c(max_i.squared), decimals = 0) |>
  fmt_scientific(columns = max_p.value, decimals = 0) |>
  tab_spanner(label = "Effect Sizes < 0", columns = ends_with("neg")) |>
  tab_spanner(label = "Effect Sizes > 0", columns = ends_with("pos")) |>
  tab_spanner(label = "Example P-E Association (lowest p.value for E domain)",
              columns = starts_with("max"))

e_group_summary |> gtsave('./e_group_summary_v2.html')
e_group_summary
escategory esubcategory n_tests n_sig_pct Effect Sizes < 0 Effect Sizes > 0 Example P-E Association (lowest p.value for E domain)
median_bonf_r2_neg q25_bonf_estimate_neg median_bonf_estimate_neg q75_bonf_estimate_neg median_bonf_r2_pos q25_bonf_estimate_pos median_bonf_estimate_pos q75_bonf_estimate_pos max_pvarname max_evarname max_estimate max_r2 max_p.value max_i.squared
allergy NA 404 0.01 NA NA NA NA 0.002 0.323 0.488 0.648 Ward's triangle bone mineral content Tissue transglutaminase 0.650 0.002 1 × 10−10 0
housing NA 534 0.01 0.001 −0.499 −0.341 −0.289 0.001 0.175 0.260 0.386 Alkaline phosphotase Is this {mobile home/house/apartment} owned, being bought, rented, or occup −0.102 0.002 8 × 10−15 33
infection NA 4783 0.04 0.002 −1.018 −0.535 −0.197 0.002 0.109 0.143 0.342 Upper Arm Length Hepatitis D 0.568 0.000 6 × 10−79 0
nutrients dietary biomarker 4949 0.21 0.009 −0.149 −0.103 −0.076 0.012 0.079 0.113 0.178 Cholesterol g-tocopherol 0.265 0.071 4 × 10−201 0
nutrients dietary interview 14246 0.05 0.003 −0.089 −0.066 −0.051 0.004 0.058 0.076 0.104 Direct HDL-Cholesterol Alcohol 0.206 0.038 3 × 10−114 7
nutrients supplements 10466 0.01 0.005 −0.125 −0.075 −0.060 0.007 0.079 0.107 0.135 Serum total folate Any Dietary Supplements taken in the past 24 hour? −0.519 0.053 7 × 10−159 0
physical activity NA 2826 0.17 0.005 −0.095 −0.073 −0.052 0.004 0.048 0.064 0.106 60 sec. pulse (30 sec. pulse * 2): Vigorous Leisure Per Week −0.183 0.027 6 × 10−87 0
pollutant amide 198 0.29 0.007 −0.107 −0.081 −0.059 0.006 0.069 0.079 0.127 Mean cell volume Acrylamide 0.148 0.023 5 × 10−38 0
pollutant amine 156 0.16 0.011 −0.137 −0.119 −0.101 0.016 0.145 0.164 0.184 Glycohemoglobin o-Toluidine, urine 0.203 0.018 2 × 10−17 27
pollutant diakyl 2148 0.04 0.004 −0.088 −0.069 −0.062 0.008 0.062 0.099 0.120 Creatinine Urinary nitrate −0.180 0.019 6 × 10−49 0
pollutant heavy metals 6581 0.13 0.007 −0.125 −0.102 −0.080 0.006 0.062 0.085 0.114 Arm Circumference Lead −0.164 0.019 3 × 10−136 0
pollutant hydrocarbon 1647 0.12 0.007 −0.119 −0.100 −0.081 0.007 0.084 0.104 0.133 White blood cell count 2-napthol 0.216 0.029 6 × 10−51 0
pollutant organochlorine 4885 0.04 0.032 −0.344 −0.266 −0.148 0.022 0.130 0.178 0.273 Arm Circumference PCB194 −0.374 0.050 1 × 10−45 0
pollutant organophosphate 423 0.03 0.000 −0.027 −0.018 −0.013 NA NA NA NA Triceps Skinfold Dimethoate −0.022 0.000 8 × 10−35 0
pollutant phenols 2173 0.06 0.008 −0.124 −0.102 −0.076 0.006 0.066 0.084 0.101 Weight Ethyl paraben −0.124 0.014 4 × 10−50 0
pollutant phthalates 2306 0.07 0.005 −0.110 −0.086 −0.069 0.005 0.060 0.080 0.094 Standing Height Mono-n-butyl phthalate −0.111 0.008 2 × 10−47 1
pollutant polyfluoro 2346 0.06 0.012 −0.131 −0.115 −0.082 0.012 0.094 0.113 0.151 Hemoglobin Perfluorohexane sulfonic acid 0.113 0.015 2 × 10−34 0
pollutant priority pesticide 2502 0.04 0.001 −0.066 −0.028 −0.011 0.000 0.011 0.016 0.044 Glycohemoglobin Nicosulfuron 0.016 0.000 1 × 10−172 0
pollutant VOC 4492 0.06 0.006 −0.108 −0.085 −0.023 0.003 0.020 0.052 0.106 Mean cell volume Blood Chlorobenzene Result 0.025 0.001 0 4
smoking smoking behavior 1873 0.04 0.008 −0.286 −0.213 −0.165 0.009 0.129 0.186 0.283 Mean cell volume Smoking: current, ever, never 0.482 0.033 4 × 10−148 28
smoking smoking biomarker 1088 0.09 0.004 −0.087 −0.074 −0.059 0.010 0.070 0.108 0.168 Segmented neutrophils num Cotinine 0.164 0.023 9 × 10−176 2
## 
## Rebuild p_group_summary for Table 2 (overwrites the version above),
## restricted to exposure terms and capping the example picks at
## R^2 <= 0.1. arrange + first() select each phenotype group's lowest
## Bonferroni p (tie-broken by R^2) as the displayed example.
p_group_summary <- adjusted_meta_2 |> filter(term_name %in% c('expo', 'expo1', 'expo2', 'expo3')) |> filter(mean_adjusted_base_r2_diff <= .1) |> unite(p_scategory, c(pcategory, psubcategory)) |> group_by(p_scategory) |> arrange(pvalue_bonferroni,mean_adjusted_base_r2_diff) |>  
  summarize(
            #median_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  
            #median_abs_estimate=median(abs(estimate), na.rm=T),
            n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  
            #median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), 
            ## Descriptions get their units stripped for display.
            max_termname=remove_units_from_string(first(term_name)),
            max_pvarname=remove_units_from_string(first(pvardesc)),
            max_evarname=remove_units_from_string(first(evardesc)),
            max_estimate=first(estimate), max_p.value=first(p.value), max_i.squared=first(i.squared)) |> mutate(n_sig_pct=n_sig/n_tests)

## Among Bonferroni-significant hits only: median R^2 and effect-size
## quartiles per phenotype group, split by the sign of the estimate, then
## widened to one row per group with _neg/_pos column suffixes.
p_bonf_group_summary <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> 
  unite(p_scategory, c(pcategory, psubcategory)) |>
  mutate(sgn=ifelse(sign(estimate) <0, 'neg', 'pos')) |> group_by(p_scategory, sgn) |> arrange(pvalue_bonferroni) |>  
  summarize(median_bonf_r2=median(mean_adjusted_base_r2_diff, na.rm=T),  
            q25_bonf_estimate=quantile(estimate, probs=.25, na.rm=T),
            median_bonf_estimate=median((estimate), na.rm=T),
            q75_bonf_estimate=quantile(estimate, probs=.75, na.rm=T)
  )
## `summarise()` has grouped output by 'p_scategory'. You can override using the
## `.groups` argument.
p_bonf_group_summary <- p_bonf_group_summary |> pivot_wider(names_from=sgn, values_from=(c(median_bonf_r2, q25_bonf_estimate, median_bonf_estimate, q75_bonf_estimate)))

## Merge the overall and Bonferroni-only summaries, split the combined
## category key back into category/subcategory, keep groups with more than
## one significant hit, and order the columns for display.
p_group_summary <- p_group_summary |>
  left_join(p_bonf_group_summary, by='p_scategory') |>
  separate(col=p_scategory, into=c("pscategory", "psubcategory"), sep="_") |>
  filter(n_sig > 1) |>
  select(pscategory, psubcategory, n_tests, n_sig_pct,
         q25_bonf_estimate_neg, median_bonf_estimate_neg, q75_bonf_estimate_neg,
         q25_bonf_estimate_pos, median_bonf_estimate_pos, q75_bonf_estimate_pos,
         max_pvarname, max_evarname, max_estimate, max_r2, max_p.value, max_i.squared)

## Render as a gt table (same formatting and spanners as the exposure
## table), save to HTML, and print. This rebinds p_group_summary from a
## data frame to a gt object, as in the original.
p_group_summary <- p_group_summary |>
  gt() |>
  fmt_number(columns = c(starts_with("q"), starts_with("med")), decimals = 3) |>
  fmt_number(columns = c(max_r2, max_estimate), decimals = 3) |>
  fmt_number(columns = c(n_sig_pct), decimals = 2) |>
  fmt_number(columns = c(max_i.squared), decimals = 0) |>
  fmt_scientific(columns = max_p.value, decimals = 0) |>
  tab_spanner(label = "Effect Sizes < 0", columns = ends_with("neg")) |>
  tab_spanner(label = "Effect Sizes > 0", columns = ends_with("pos")) |>
  tab_spanner(label = "Example P-E Association (lowest p.value for E domain)",
              columns = starts_with("max"))

p_group_summary |> gtsave('./p_group_summary_v2.html')
p_group_summary
pscategory psubcategory n_tests n_sig_pct Effect Sizes < 0 Effect Sizes > 0 Example P-E Association (lowest p.value for E domain)
q25_bonf_estimate_neg median_bonf_estimate_neg q75_bonf_estimate_neg q25_bonf_estimate_pos median_bonf_estimate_pos q75_bonf_estimate_pos max_pvarname max_evarname max_estimate max_r2 max_p.value max_i.squared
anthropometric dexa 24753 0.06 −0.129 −0.102 −0.078 0.073 0.109 0.143 Android fat mass 25-hydroxyvitamin D3 −0.217 0.036 3 × 10−133 0
anthropometric NA 5953 0.15 −0.119 −0.082 −0.056 0.055 0.076 0.147 Arm Circumference cis-b-carotene −0.205 0.037 4 × 10−173 0
biochemistry bone 849 0.08 −0.096 −0.086 −0.074 0.073 0.093 0.124 Total calcium Retinol 0.236 0.046 2 × 10−154 0
biochemistry electrolyte 1925 0.05 −0.102 −0.089 −0.070 0.067 0.091 0.107 Chloride Blood Chlorobenzene Result −0.053 0.003 0 0
biochemistry hormone 3349 0.02 −0.109 −0.089 −0.079 0.044 0.060 0.090 Parathyroid Hormone(Elecys method) pg/mL Vitamin D −0.269 0.058 2 × 10−71 14
biochemistry immunity 385 0.11 −0.114 −0.074 −0.060 0.066 0.081 0.109 Globulin Metsulfuron methyl −0.028 0.001 3 × 10−61 0
biochemistry inflammation 670 0.09 −0.096 −0.075 −0.055 0.089 0.111 0.127 C-reactive protein cis-b-carotene −0.159 0.022 1 × 10−66 0
biochemistry injury 362 0.05 −0.087 −0.067 −0.052 0.052 0.061 0.080 Lactate dehydrogenase LDH Smoking: current, ever, never −0.180 0.005 7 × 10−41 0
biochemistry kidney 1796 0.13 −0.120 −0.093 −0.063 0.079 0.117 0.169 Creatinine Blood Chlorobenzene Result 0.117 0.025 0 0
biochemistry lipids 3278 0.07 −0.097 −0.068 −0.054 0.082 0.199 0.319 Cholesterol g-tocopherol 0.265 0.071 4 × 10−201 0
biochemistry liver 1880 0.07 −0.117 −0.088 −0.054 0.053 0.072 0.090 Alkaline phosphotase Pyridoxal 5'-phosphate −0.115 0.013 8 × 10−84 0
biochemistry liver/kidney 1390 0.08 −0.104 −0.080 −0.053 0.050 0.080 0.113 Albumin Pyridoxal 5'-phosphate 0.252 0.060 1 × 10−84 52
biochemistry metabolic 2743 0.08 −0.118 −0.083 −0.059 0.060 0.110 0.167 Glycohemoglobin Nicosulfuron 0.016 0.000 1 × 10−172 0
biochemistry nutritional status 4019 0.09 −0.144 −0.108 −0.086 0.092 0.123 0.157 Folate, RBC Vitamin B12 0.245 0.057 7 × 10−200 0
blood pressure NA 2710 0.05 −0.098 −0.067 −0.050 0.067 0.088 0.107 60 sec. pulse (30 sec. pulse * 2): Vigorous Leisure Per Week −0.183 0.027 6 × 10−87 0
blood iron 2223 0.04 −0.176 −0.146 −0.101 0.070 0.110 0.148 Iron, refigerated g-tocopherol −0.151 0.021 1 × 10−89 0
blood NA 9004 0.07 −0.094 −0.067 −0.050 0.064 0.095 0.128 Mean cell volume Blood Chlorobenzene Result 0.025 0.001 0 4
fitness NA 663 0.05 −0.148 −0.115 −0.085 0.084 0.123 0.134 Predicted maximal oxygen uptake g-tocopherol −0.156 0.023 5 × 10−76 0
lung exhaled NO 332 0.08 −0.199 −0.166 −0.136 0.055 0.059 0.072 Mean of two reproducible FENO measurements, in parts per billion Smoking: current, ever, never −0.524 0.040 2 × 10−86 0
lung lung function 546 0.07 −0.096 −0.072 −0.063 0.045 0.051 0.072 Baseline 1st Test Spirometry, Forced Expiratory Volume in the first 1.0 sec Cadmium, urine (ng/mL). −0.167 0.013 3 × 10−44 0
microbiome NA 2224 0.02 −0.038 −0.022 −0.020 0.032 0.116 0.149 RB_InverseSimpsonMean Blood Chlorobenzene Result 0.022 0.000 1 × 10−24 0
## Volcano plots: estimate vs -log10(p), with p-values floored at 1e-30 so
## extreme hits do not compress the y axis. The horizontal line marks the
## Bonferroni threshold; its value is hoisted since it is reused below.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(p_cap = ifelse(p.value < 1e-30, 1e-30, p.value))
bonf_line <- -log10(.05/nrow(adjusted_meta_2))

## A: faceted by exposure category (autoantibody excluded).
p1 <- ggplot(adjusted_meta_2 |> filter(ecategory != 'autoantibody'), aes(estimate, -log10(p_cap))) +
  geom_point(shape='.') +
  scale_x_continuous(limits=c(-1, 1)) +
  facet_grid(ecategory ~ .) +
  scale_color_npg() +
  geom_hline(yintercept = bonf_line, color='lightblue') +
  theme_minimal() +
  theme(legend.position = "none") +
  ylab('p.value') + xlab("estimate")

## uBiome only (built but not shown in the combined grid below).
p_microbiome <- ggplot(adjusted_meta_2 |> filter(pcategory == 'microbiome'), aes(estimate, -log10(p_cap))) +
  geom_point(shape='.') +
  scale_x_continuous(limits=c(-1, 1)) +
  facet_grid(ecategory ~ .) +
  scale_color_npg() +
  geom_hline(yintercept = bonf_line, color='lightblue') +
  theme_minimal() +
  theme(legend.position = "none") +
  ylab('p.value') + xlab("estimate")

## B: all associations, faceted by phenotype category.
p2 <- ggplot(adjusted_meta_2, aes(estimate, -log10(p_cap))) +
  geom_point(shape='.') +
  scale_x_continuous(limits=c(-1, 1)) +
  facet_grid(pcategory ~ .) +
  scale_color_npg() +
  geom_hline(yintercept = bonf_line, color='lightblue') +
  theme_minimal() +
  theme(legend.position = "none") +
  ylab('p.value') + xlab("estimate")

plot_grid(p1, p2, ncol=2, labels=c("A", "B"))
## Warning: Removed 55 rows containing missing values (geom_point).
## Removed 55 rows containing missing values (geom_point).

## Among Bonferroni-significant associations, summarize per exposure
## subcategory and estimate sign: median p-value, count, and mean estimate.
e_effect_sizes_per <- adjusted_meta_2 |>
  filter(sig_levels == 'Bonf.<0.05') |>
  group_by(ecategory, esubcategory, sign(estimate)) |>
  summarize(median_pvalue=median(p.value),
            number_significant=n(),  # fixed typo: was number_signficant
            mean_estimate=mean((estimate))) |>
  arrange(-mean_estimate)
## `summarise()` has grouped output by 'ecategory', 'esubcategory'. You can
## override using the `.groups` argument.
## Use the top-level category as the label when no subcategory is recorded.
e_effect_sizes_per <- e_effect_sizes_per |> mutate(esubcategory = ifelse(is.na(esubcategory), ecategory, esubcategory))
p <- ggplot(e_effect_sizes_per, aes(mean_estimate, -log10(median_pvalue), label=esubcategory))
p <- p + geom_point(aes(size=number_significant)) + geom_text_repel() + geom_vline(xintercept = 0)
## Axis label fixed: the y axis plots -log10(pvalue), not log10(pvalue).
p <- p + theme_bw() + xlab("Average(Estimate) within exposome groups") + ylab("Median -log10(pvalue)")
p <- p + theme(legend.position = "bottom")
p
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Mean exposure-attributable R^2 among Bonferroni-significant hits, per
## phenotype category/subcategory (used to annotate the ECDF plot below).
p_effect_sizes_per <- adjusted_meta_2 |>
  filter(sig_levels == 'Bonf.<0.05') |>
  group_by(pcategory, psubcategory) |>
  summarize(mean_r2 = mean(mean_adjusted_base_r2_diff, na.rm=T))
## `summarise()` has grouped output by 'pcategory'. You can override using the
## `.groups` argument.
## term_name 'expo' presumably marks a continuous exposure; other terms
## (expo1, expo2, ...) look like factor-level contrasts — TODO confirm.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(vartype = ifelse(term_name == 'expo', 'continuous', 'categorical'))
## ECDF of |estimate| for categorical exposure terms, by significance tier.
p <- ggplot(adjusted_meta_2 |> filter(vartype == 'categorical'), aes(abs(estimate), color=sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits=c(0, .25)) +
  xlab("abs(estimate)") + ylab("percentile") +
  theme(legend.position="bottom") +
  scale_color_npg()
p
## Warning: Removed 1574 rows containing non-finite values (stat_ecdf).

## ECDF of |estimate| for continuous exposure terms only.
p <- ggplot(adjusted_meta_2 |> filter(vartype == 'continuous'), aes(abs(estimate), color=sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits=c(0, .25)) +
  xlab("abs(estimate)") + ylab("percentile") +
  theme(legend.position="bottom") +
  scale_color_npg()
p
## Warning: Removed 192 rows containing non-finite values (stat_ecdf).

## Same ECDF over all exposure terms combined.
p <- ggplot(adjusted_meta_2, aes(abs(estimate), color=sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits=c(0, .25)) +
  xlab("abs(estimate)") + ylab("percentile") +
  theme(legend.position="bottom") +
  scale_color_npg()
p
## Warning: Removed 1766 rows containing non-finite values (stat_ecdf).

## Empirical CDFs of the R^2 gain, for Bonferroni-significant and
## non-significant associations.
ecdf_for_sig <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> pull(mean_adjusted_base_r2_diff) |> ecdf()
## NOTE(review): ecdf_for_non_sig is computed but not used below.
ecdf_for_non_sig <- adjusted_meta_2 |> filter(sig_levels == '> BY & Bonf.') |> pull(mean_adjusted_base_r2_diff) |> ecdf()


## Place each phenotype group's mean R^2 on the significant-hit ECDF and
## label it (falling back to the top-level category when subcategory is NA).
p_effect_sizes_per <- p_effect_sizes_per |>
  mutate(q = ecdf_for_sig(mean_r2),
         sig_levels = 'Bonf.<0.05',
         p_cat = ifelse(is.na(psubcategory), pcategory, psubcategory))
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, color=sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits=c(0, .05)) +
  scale_color_aaas() +
  geom_point(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels)) +
  geom_text_repel(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels, label=p_cat)) +
  xlab("R^2 (adjusted-base model)") + ylab("percentile") +
  theme_bw() + theme(legend.position="bottom")
p
## Warning: Removed 1295 rows containing non-finite values (stat_ecdf).

## Order the significance tiers for plotting, then show between-survey
## heterogeneity (I^2) by number of surveys and significance tier.
adjusted_meta_2 <- adjusted_meta_2 |>
  mutate(sig_levels = fct_relevel(sig_levels, c("> BY & Bonf.", "BY<0.05", "Bonf.<0.05")))

p <- ggplot(adjusted_meta_2, aes(factor(nobs), i.squared, color=sig_levels)) +
  geom_boxplot() +
  xlab("Number of Surveys for PE Association") +
  scale_color_aaas() +
  theme_bw() +
  theme(legend.position="bottom") +
  ylab("i-squared")
p

i2_medians <- adjusted_meta_2 |> group_by(sig_levels) |> summarize(i2_median=median(i.squared))

Reverse meta-analysis: replicability of P-E

## Reverse meta-analysis: for each P-E pair and exposure term, count in how
## many individual surveys the fully adjusted (model 2) association reached
## nominal significance.
within_survey_pvalue_threshold <- 0.05 # should this be changed for each survey?
expos_wide <- expos_wide |> mutate(p.value_adjusted = p.value_2, estimate_adjusted = estimate_2, std.error_adjusted = std.error_2)
## na.rm=TRUE added: without it a single missing p-value makes n_pvalue_lt
## NA for the whole pair, silently dropping it from every downstream
## `n_pvalue_lt >= 2` filter.
p_val_pe_pair <- expos_wide |> group_by(evarname, pvarname, term) |>
  summarize(n_pvalue_lt = sum(p.value_adjusted < within_survey_pvalue_threshold, na.rm = TRUE),
            total_n = n(),
            pct_pvalue_lt = n_pvalue_lt / total_n)
## `summarise()` has grouped output by 'evarname', 'pvarname'. You can override
## using the `.groups` argument.
## NOTE(review): the join matches expo_name against term — verify that
## adjusted_meta_2$expo_name uses the same term labels as expos_wide$term.
adjusted_meta_3 <- adjusted_meta_2 |> left_join(p_val_pe_pair, by=c("evarname"="evarname", "pvarname"="pvarname", "expo_name"="term"))

Showcasing associations:

## Counts of Bonferroni-significant associations, overall and with
## within-survey replication (nominal p < 0.05 in at least two surveys).
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> nrow() 
## [1] 4983
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2) |> nrow()
## [1] 0
## NOTE(review): 0 replicated hits out of 4983 looks wrong — most likely
## n_pvalue_lt is NA (sum() without na.rm upstream) or the expo_name/term
## join key failed to match; verify before trusting the filters below.
## Replicated hits split by heterogeneity: I^2 < 50 (consistent across
## surveys) vs I^2 > 50 (heterogeneous), both requiring R^2 gain > 0.025.
non_het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared < 50, mean_adjusted_base_r2_diff > .025)
het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared > 50, mean_adjusted_base_r2_diff > .025, nobs >= 4)
#het_pairs_2 <- adjusted_meta_3 |> filter(sig_levels == 'BY<0.05', i.squared > 90, nobs >= 4) 

## Significant hits per exposure category.
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> group_by(ecategory) |> count()
## non-heterogeneous example



plot_pair <- function(evarname_str, pvarname_str, estimate_limits=c(0.01,.35)) {
  ## Forest-style plot for one exposure-phenotype pair: per-survey adjusted
  ## estimates with 95% CIs, plus the overall meta-analytic estimate (drawn as
  ## a larger diamond) annotated with the meta-analysis I^2.
  ##
  ## evarname_str / pvarname_str: exposure / phenotype variable names.
  ## estimate_limits: y-axis (estimate) limits; pass c() (i.e. NULL) to leave
  ##   the axis unconstrained.
  ## Returns a ggplot object. Reads the globals `expos_wide` and
  ## `adjusted_meta_2` and the helper `remove_units_from_string` (defined
  ## elsewhere in this project).
  test_1 <- expos_wide |> filter(evarname == evarname_str, pvarname == pvarname_str) |> select(Begin.Year,  exposure_table_name, phenotype_table_name, e_variable_description, p_variable_description, estimate_adjusted, std.error_adjusted, p.value_adjusted) 
  exposure_name <- remove_units_from_string(test_1$e_variable_description[1])
  phenotype_name <- remove_units_from_string(test_1$p_variable_description[1])
  ## Per-survey rows carry no I^2; blank text keeps rbind columns aligned.
  test_1 <- test_1 |> select(Begin.Year, estimate_adjusted, std.error_adjusted) |> rename(estimate=estimate_adjusted, std.error = std.error_adjusted, Survey=Begin.Year) |> mutate(i.squared = NA, i.squared_text='')
  ## Overall (meta-analytic) row, labeled with I^2 as a rounded percentage.
  meta_test_1 <- adjusted_meta_2 |> filter(evarname == evarname_str, pvarname == pvarname_str) |> mutate(Survey = 'overall') |> select(Survey, estimate, std.error, i.squared) |> mutate(i.squared_text = sprintf("%i%%", round(i.squared)))
  test_1 <- test_1 |> rbind(meta_test_1) 
  ## Shape 23 (diamond) and a larger size mark the overall row; shape 21
  ## (circle) marks individual surveys.
  test_1 <- test_1 |> mutate(point_shape = ifelse(Survey == 'overall', 23, 21)) |> mutate(point_shape = as.integer(point_shape)) 
  test_1 <- test_1 |> mutate(point_size = ifelse(Survey == 'overall', 7, 2)) 
  p <- ggplot(test_1, aes(Survey, estimate))
  p <- p + geom_point(aes(shape=point_shape, size=point_size, fill=point_shape)) + scale_shape_identity() + scale_size_identity()
  p <- p + geom_text(data=meta_test_1, aes(Survey, estimate, label=i.squared_text), size=3, color="black", nudge_x=.6) 
  p <- p + geom_errorbar(aes(ymin=estimate-1.96*std.error, ymax=estimate+1.96*std.error), width=.1) + scale_x_discrete(limits=rev)
  p <- p + scale_y_continuous(limits=estimate_limits)
  p <- p + coord_flip()  + theme_bw() + theme(legend.position = "none")
  p <- p + ggtitle(sprintf('scale(%s)-scale(log10(%s))', phenotype_name, exposure_name))+ theme(plot.title = element_text(size = 7))
  ## Return the plot explicitly instead of relying on the invisible value of
  ## the final assignment (which makes `plot_pair(...)` alone print nothing
  ## and is easy to break when editing the last line).
  p
}

## Example forest plots via plot_pair() (defined above).
## non-heterogeneous example
p1 <- plot_pair('URXP01', 'LBDNENO')
## heterogeneous example
p2 <- plot_pair('LBXGTC', 'BMXBMI')

## Passing c() (NULL) leaves the estimate axis unconstrained.
p3 <- plot_pair('LBXBPB', 'BMXHT', c()) # 33% i2

p4 <- plot_pair('LBXCOT', 'BPXPLS', c()) # 33% i2



#expos_wide |> filter(evarname == 'LBXPFOS', pvarname == 'LBXSAL') 
## 2x2 panel of the four example pairs (p1-p4 are re-created further below for
## the paper figure).
plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)

# Examples for paper of top hits

## Between-survey variance (tau^2) for the four showcased P-E pairs.
rbind(
  adjusted_meta_2 |> filter(evarname == 'LBXPFHS', pvarname == 'LBXSAL') |> select(tau.squared),
  adjusted_meta_2 |> filter(evarname == 'LBXGTC', pvarname == 'LBXTC') |> select(tau.squared),
  adjusted_meta_2 |> filter(evarname == 'LBXBPB', pvarname == 'BMXWT') |> select(tau.squared),
  adjusted_meta_2 |> filter(evarname == 'LBXCOT', pvarname == 'LBDNENO') |> select(tau.squared)
)
## Spread of estimates across phenotypes for each exposure, by significance.
adjusted_meta_2 |> filter(evarname == 'LBXPFHS') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXGTC') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXBPB') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXCOT') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
## Forest plots for the paper figure (these overwrite p1-p4 from above).
p1 <- plot_pair('LBXPFHS', 'LBXSAL') + ylab("Association Estimate")
## heterogeneous example
p2 <- plot_pair('LBXGTC', 'LBXTC') + ylab("Association Estimate")

p3 <- plot_pair('LBXBPB', 'BMXWT', c(-.3, 0)) + ylab("Association Estimate")

p4 <- plot_pair('LBXCOT', 'LBDNENO')+ ylab("Association Estimate")

plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)

library(ggridges)

## Distributions of all Bonferroni-significant association estimates for four
## showcase exposures, plus the survey-specific estimates of the showcased pairs.
ep_candidates <- tibble(
  evarname = c("LBXGTC", "LBXBPB", "LBXPFHS", "LBXCOT"),
  pvarname = c("LBXTC", "BMXWT", "LBXSAL", "LBDNENO")
)

exposure_dist <- adjusted_meta_2 |>
  filter(evarname %in% ep_candidates$evarname) |>
  filter(sig_levels == "Bonf.<0.05") |>
  mutate(evardesc = remove_units_from_string(evardesc))

## Collect the survey-specific associations for each candidate pair.
survey_exposure_pts <- map2(
  ep_candidates$evarname, ep_candidates$pvarname,
  function(ev_str, pv_str) {
    expos_wide |>
      filter(evarname == ev_str, pvarname == pv_str) |>
      select(Begin.Year, evarname, pvarname, exposure_table_name,
             phenotype_table_name, e_variable_description,
             p_variable_description, estimate_adjusted,
             std.error_adjusted, p.value_adjusted)
  }
) |>
  bind_rows() |>
  mutate(evardesc = remove_units_from_string(e_variable_description))

## Trim the NHANES prefix (e.g., "LBX") for compact axis labels.
exposure_dist <- exposure_dist |> mutate(evarname = substr(evarname, 4, 8))
survey_exposure_pts <- survey_exposure_pts |>
  mutate(pvarname = substr(pvarname, 4, 8), evarname = substr(evarname, 4, 8))

## Ridge densities of significant estimates per exposure, with the showcased
## pair's survey-specific estimates overlaid as colored points.
pheno_het <- ggplot(exposure_dist, aes(y = evarname, x = estimate, fill = evarname)) +
  geom_density_ridges() +
  scale_fill_jama(guide = "none") + #+ scale_colour_continuous(guide = "none")
  geom_point(
    data = survey_exposure_pts,
    aes(y = evarname, x = estimate_adjusted, color = factor(Begin.Year))
  ) +
  scale_color_aaas(name = "") +
  theme_bw() +
  theme(legend.position = 'bottom') +
  ylab("Exposure Name") +
  xlab("Association Estimate")

## 2x2 grid of the per-pair forest plots, then the ridge panel alongside.
survey_het <- plot_grid(p1, p2, p3, p4, ncol = 2, labels = c('A', 'B', "C", "D"), label_size = 12)

p <- plot_grid(survey_het, pheno_het, ncol = 2, labels = c("", "E"), label_size = 12)
## Picking joint bandwidth of 0.0348

R² of the adjusted vs. non-adjusted models

## Base-model R^2 vs. full-model (exposure + demographics) R^2.
p <- ggplot(expos_wide, aes(rsq_2_1, rsq_2_0)) +
  geom_point(shape = '.') +
  xlab("R2 Base [Demographics]") +
  ylab("R2 [Exposure + Demographics]") +
  theme_bw()
p

## Adjusted-model R^2 vs. unadjusted (exposure-only) R^2.
p <- ggplot(expos_wide, aes(rsq_2_0, rsq_1_0)) +
  geom_point(shape = '.') +
  xlab("R2 Adjusted [Exposure + Demographics]") +
  ylab("R2 Unadjusted [Exposure]") +
  theme_bw()
p

## I^2 vs. the exposure's incremental R^2 (adjusted minus base model),
## stratified by significance level.
## Fix: the x variable is mean_adjusted_base_r2_diff, but the axis was
## previously labeled "R2 Adjusted [Exposure + Demographics]".
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, i.squared))
p <- p + geom_point(shape='.') + xlab("Mean R2 difference (adjusted - base)") + ylab("i.squared") + scale_x_continuous(limits=c(0, .01))
p <- p + theme_bw() + facet_grid(~sig_levels)
p
## Warning: Removed 3752 rows containing missing values (geom_point).

Estimates and P-values: adjusted vs. non-adjusted models

## Adjusted vs. unadjusted estimates, with the identity line and an OLS fit.
p <- ggplot(expos_wide, aes(estimate_2, estimate_1)) +
  geom_point(shape = '.') +
  scale_x_continuous(limits = c(-2, 2)) +
  scale_y_continuous(limits = c(-2, 2)) +
  xlab("Adjusted [Exposure + Demographics]") +
  ylab("Unadjusted [Exposure]") +
  geom_abline() +
  geom_smooth(method = "lm") +
  theme_bw()
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 161 rows containing non-finite values (stat_smooth).
## Warning: Removed 161 rows containing missing values (geom_point).

## how much bias?
tidy(lm(estimate_2  ~ estimate_1, expos_wide)) # biased to be lower (intercept is negative, and slope is less than 1)
# 


## Adjusted vs. unadjusted -log10 p-values with the identity line.
p <- ggplot(expos_wide, aes(-log10(p.value_2), -log10(p.value_1))) +
  geom_point(shape = '.', alpha = .1) +
  xlab("Adjusted model pvalue [Exposure + Demographics]") +
  ylab("Unadjusted Pvalue [Exposure]") +
  geom_abline() +
  theme_bw()
p
## Warning: Removed 20562 rows containing missing values (geom_point).

## Adjusted vs. unadjusted z-statistics, truncated to +/-20 on both axes.
p <- ggplot(expos_wide, aes(statistic_2, statistic_1)) +
  geom_point(shape = '.', alpha = .1) +
  xlab("Adjusted model z [Exposure + Demographics]") +
  ylab("Unadjusted z [Exposure]") +
  geom_abline() +
  scale_y_continuous(limit = c(-20, 20)) +
  scale_x_continuous(limits = c(-20, 20)) +
  theme_bw()
p
## Warning: Removed 1871 rows containing missing values (geom_point).

## 

## Map model_number -> adjustment scenario, ordered from least to most
## adjusted (order_number 1..9). Built as one vectorized tibble rather than
## an rbind of one-row tibbles; values and column types are unchanged.
adjustment_number_order <- tibble(
  model_number = c(1, 5, 4, 3, 9, 6, 8, 7, 2),
  scenario = c(
    "base",
    "sex",
    "age",
    "age_sex",
    "ethnicity",
    "age_sex_ethnicity",
    "income_education",
    "age_sex_income_education",
    "age_sex_ethnicity_income_education"
  ),
  order_number = c(1, 2, 3, 4, 5, 6, 7, 8, 9)
)


#bonf_thresh <- 0.05/nrow(adjusted_meta_2)
## Pull per-scenario meta-analytic results (all adjustment models) for pairs
## observed in at least two surveys; collect() materializes from SQLite.
adjusted_meta_tp <- adjusted_meta |> filter(nobs >= 2 ) |> select(evarname, pvarname, model_number,term_name, estimate, std.error, statistic, p.value) |> collect()

#adjusted_meta_full <- adjusted_meta_tp |> filter(model_number == 2) |> rename(estimate_adjusted=estimate, p.value_adjusted = p.value, statistic_adjusted=statistic) |> select(-model_number) 

## Fully adjusted results renamed to serve as the *_adjusted reference columns.
adjusted_meta_full <- adjusted_meta_2 |> rename(estimate_adjusted=estimate, p.value_adjusted = p.value, statistic_adjusted=statistic) |> select(-model_number) 

## Attach the reference estimates and the human-readable scenario labels.
adjusted_meta_tp <- adjusted_meta_tp  |> left_join(adjusted_meta_full, by=c("evarname", "pvarname", "term_name"))
adjusted_meta_tp <- adjusted_meta_tp  |> left_join(adjustment_number_order |> select(c(model_number, scenario, order_number)), by="model_number")

## Z-statistics under each adjustment scenario vs. the fully adjusted model.
p <- ggplot(adjusted_meta_tp, aes(statistic_adjusted, statistic)) +
  geom_point(shape = '.') +
  facet_wrap(~scenario) +
  geom_abline(slope = 1, intercept = 0) +
  theme_bw()
p 
## Warning: Removed 45 rows containing missing values (geom_point).

## Estimates under each scenario vs. the fully adjusted estimate.
p <- ggplot(adjusted_meta_tp, aes(estimate_adjusted, estimate)) +
  geom_point(shape = '.') +
  facet_wrap(~scenario) +
  theme_bw()
p
## Warning: Removed 45 rows containing missing values (geom_point).

# adjusted_meta_tp 
## Base-model (model 1) vs. fully adjusted estimates, split by significance.
am2 <- adjusted_meta_tp |>
  filter(model_number == 1) |>
  filter(!is.na(sig_levels))
p <- ggplot(am2, aes(estimate_adjusted, estimate, color = sig_levels)) +
  geom_point(shape = '.') +
  scale_x_continuous(limits = c(-1, 1)) +
  scale_y_continuous(limits = c(-1, 1)) +
  scale_color_aaas() +
  geom_abline() +
  facet_grid(~sig_levels) +
  xlab("Adjusted model estimate [Exposure + Demographics]") +
  ylab("Unadjusted estimate [Exposure]") +
  geom_smooth(method = "lm") +
  theme_bw() +
  theme(legend.position = "none") ## need to get sig_levels
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Warning: Removed 160 rows containing missing values (geom_point).

# 
# 
# 
## Quantify attenuation: regress adjusted on unadjusted estimates overall and
## within significance strata (slope < 1 indicates shrinkage on adjustment).
tidy(lm(estimate_adjusted ~ estimate, am2))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(pvalue_bonferroni < .05)))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(sig_levels == '> BY & Bonf.')))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(sig_levels == 'BY<0.05')))
# 
# 
## The ecategory and pcategory panels below were byte-near-duplicates; factor
## the shared pipeline into a helper parameterized by the category column.
plot_adjusted_vs_unadjusted_by <- function(cat_var) {
  ## Unadjusted (model 1) vs. fully adjusted estimates, colored and
  ## column-faceted by the given category column (tidy-eval reference).
  ## Returns a ggplot object; reads the global `adjusted_meta_tp`.
  adjusted_meta_tp |>
    filter(model_number == 1, !is.na({{ cat_var }})) |>
    ggplot(aes(estimate, estimate_adjusted, color = {{ cat_var }})) +
    geom_point(shape = '.') +
    scale_x_continuous(limits = c(-1, 1)) +
    scale_y_continuous(limits = c(-1, 1)) +
    scale_color_aaas() +
    geom_abline() +
    ## cols = vars(...) reproduces the column faceting of facet_grid(~var).
    facet_grid(cols = vars({{ cat_var }})) +
    ylab("Adjusted model estimate [Exposure + Demographics]") +
    xlab("Unadjusted estimate [Exposure]") +
    geom_smooth(method = "lm") +
    theme_bw() +
    theme(legend.position = "none")
}

p <- plot_adjusted_vs_unadjusted_by(ecategory)
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Removed 160 rows containing missing values (geom_point).

p <- plot_adjusted_vs_unadjusted_by(pcategory)
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Removed 160 rows containing missing values (geom_point).